# Import all the necessary modules
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer (sklearn.impute) is the drop-in replacement. It is imported
# for parity with the original even though it is not used below.
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
# '%matplotlib inline' is an IPython/notebook magic and a SyntaxError in a
# plain .py file; it is kept here as a comment for anyone running this in a
# notebook.
# %matplotlib inline

# The raw file marks missing values with '?' (see the missing-value check
# further below); parse them as NaN on load so isna()/fillna() can see them
# and the affected columns stay numeric.
data_df = pd.read_csv("vehicle-1.csv", na_values='?')
data_df.head(5)
# Replacing the categorical target by integer codes: car -> 0, van -> 1, bus -> 2.
# Restrict the replacement to the 'class' column (the original replaced across
# the whole DataFrame, which could silently touch other columns).
data_df["class"] = data_df["class"].replace({'car': '0', 'van': '1', 'bus': '2'})
# Then convert the string codes into integers.
data_df["class"] = data_df["class"].astype(int)
# Bar chart of the class distribution (Series.value_counts; pd.value_counts
# is deprecated in pandas 2.x).
data_df["class"].value_counts().plot(kind="bar")
data_df.dtypes
# The raw data contains '?' placeholders for missing values; they must be
# converted to NaN (e.g. via na_values on read_csv) to be counted here.
# Check for missing values in every column.
data_df.isna().sum()
data_df.describe()
# Replace every missing value with the mean of its own column.
# numeric_only=True prevents non-numeric columns from raising a TypeError
# in recent pandas versions.
data_df.fillna(data_df.mean(numeric_only=True), inplace=True)
# Verify there are no missing values left after mean imputation.
data_df.isna().sum()
data_df.boxplot(figsize=(20,3))
# --- Outlier treatment: replace extreme values with the column median ---
def _cap_to_median(df, col, upper=None, lower=None, inclusive_upper=False):
    """Replace out-of-range values in df[col] with the column's median.

    upper/lower are exclusive bounds by default; inclusive_upper=True also
    replaces values equal to the upper bound. Replacing values with the
    current median cannot shift the median, so computing it once up front is
    equivalent to recomputing it between the two masks.
    """
    med = df[col].median()
    if upper is not None:
        over = df[col] >= upper if inclusive_upper else df[col] > upper
        df[col] = df[col].mask(over, med)
    if lower is not None:
        df[col] = df[col].mask(df[col] < lower, med)

# radius_ratio: 3 outliers above 255
sns.boxplot(data=data_df.radius_ratio)
_cap_to_median(data_df, "radius_ratio", upper=255)
sns.boxplot(data=data_df.radius_ratio)
# axis_aspect_ratio: 8 outliers above 95
sns.boxplot(data=data_df.axis_aspect_ratio)
_cap_to_median(data_df, "axis_aspect_ratio", upper=95)
sns.boxplot(data=data_df.axis_aspect_ratio)
# length_aspect_ratio: approx 12-15 outliers outside the (3, 18) range
sns.boxplot(data=data_df.length_aspect_ratio)
_cap_to_median(data_df, "length_aspect_ratio", upper=18, lower=3)
sns.boxplot(data=data_df.length_aspect_ratio)
# scaled_variance: 1 outlier above 300
sns.boxplot(data=data_df.scaled_variance)
_cap_to_median(data_df, "scaled_variance", upper=300)
sns.boxplot(data=data_df.scaled_variance)
# scaled_variance_1: 2 outliers above 987
sns.boxplot(data=data_df.scaled_variance_1)
_cap_to_median(data_df, "scaled_variance_1", upper=987)
sns.boxplot(data=data_df.scaled_variance_1)
# scaled_radius_of_gyration_1: 9 outliers above 87
sns.boxplot(data=data_df.scaled_radius_of_gyration_1)
_cap_to_median(data_df, "scaled_radius_of_gyration_1", upper=87)
sns.boxplot(data=data_df.scaled_radius_of_gyration_1)
# skewness_about: values >= 20 replaced (inclusive bound, as in the original);
# skewness_about_1: values > 40 replaced
sns.boxplot(data=data_df.skewness_about)
_cap_to_median(data_df, "skewness_about", upper=20, inclusive_upper=True)
_cap_to_median(data_df, "skewness_about_1", upper=40)
sns.boxplot(data=data_df.skewness_about)
# All outliers are fixed
# Pairplot after the missing values have been fixed.
sns.pairplot(data_df, diag_kind='kde')

# Heatmap of Pearson correlations between all attributes.
colormap = plt.cm.viridis  # colour range used in the heatmap
plt.figure(figsize=(20, 20))
plt.title('Pearson Correlation of attributes', y=1.05, size=20)
sns.heatmap(data_df.corr(), linewidths=0.1, vmax=1.0,
            square=True, cmap=colormap, linecolor='white', annot=True)

# The following independent attribute pairs show a direct linear
# relationship, so one member of each pair is a candidate for removal:
#  1) skewness_about_2 - hollows_ratio
#  2) scaled_variance - scaled_variance_1
#  3) scaled_variance_1 - axis_rectangularity
#  4) scaled_variance_1 - scatter_ratio
#  5) scaled_variance - scatter_ratio
#  6) scaled_variance - axis_rectangularity
#  7) length_rectangularity - circularity
#  8) axis_rectangularity - scatter_ratio
#  9) scatter_ratio - distance_circularity
# 10) circularity - length_rectangularity
# 11) circularity - scaled_radius_of_gyration
#
# scaled_variance_1 and axis_rectangularity are each linearly related to
# several of the remaining columns, so keeping them would only add redundant
# information — drop both.
for redundant_col in ('scaled_variance_1', 'axis_rectangularity'):
    data_df = data_df.drop([redundant_col], axis=1)
data_df.head()
# Remove duplicate rows (keeping the first occurrence of each).
data_df.head()
print(f'Number of rows before discarding duplicates = {data_df.shape[0]}')
data_df.drop_duplicates(subset=None, keep='first', inplace=True)
print(f'Number of rows after discarding duplicates = {data_df.shape[0]}')

from scipy.stats import zscore

data_df.dtypes
# Standardise every numeric column to zero mean / unit variance.
numeric_cols = data_df.select_dtypes(include=[np.int64, np.float64]).columns
numeric_cols
data_df[numeric_cols] = data_df[numeric_cols].apply(zscore)
data_df.head()
# Independent variables
X = data_df.drop(['class'], axis=1)
# The dependent variable (kept as a one-column DataFrame, as downstream
# cells index it this way).
y = data_df[['class']]

from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
# calculate accuracy measures and confusion matrix
from sklearn import metrics

# gamma is a measure of influence of a data point (inverse of the distance
# of influence); C is the complexity of the model: a lower C creates a
# simple hyper-surface while a higher C creates a complex surface.
clf = svm.SVC(gamma=0.025, C=3)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=2)
# .values.ravel() passes a 1-D label array instead of a column-vector
# DataFrame, avoiding sklearn's DataConversionWarning.
clf.fit(X_train, Y_train.values.ravel())
Y_pred = clf.predict(X_test)
print("Accuracy on training set: {:.2f}".format(clf.score(X_train, Y_train)))
print("Accuracy on test set: {:.2f}".format(clf.score(X_test, Y_test)))
print(metrics.classification_report(Y_test, Y_pred))
print(metrics.confusion_matrix(Y_test, Y_pred))
data_df.head(2)
# Covariance matrix of the standardised attributes (the basis of PCA).
covMatrix = np.cov(X, rowvar=False)
pca = PCA(n_components=16)
pca.fit(X)
# Eigen values and vectors (the two lines below were bare prose in the
# notebook export and would be SyntaxErrors in a .py file).
print(pca.explained_variance_)  # Eigen values
print(pca.components_)          # Eigen vectors
# The percentage of variation explained by each eigen vector:
print(pca.explained_variance_ratio_)
plt.bar(list(range(1, 17)), pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# Cumulative variance explained — used to pick how many components to keep.
plt.step(list(range(1, 17)), np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()
print(pca.explained_variance_)
print(pca.components_)
print(pca.explained_variance_ratio_)
# Keep the first 6 principal components.
pca3 = PCA(n_components=6)
pca3.fit(X)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3 = pca3.transform(X)
sns.pairplot(pd.DataFrame(Xpca3))
# After dimension reduction we have 6 attributes in total.
# Use SVM to classify the class (y) of vehicles on the PCA-transformed data.
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
# calculate accuracy measures and confusion matrix
from sklearn import metrics

# Same hyper-parameters as the full-feature model above so the two runs are
# comparable: gamma is the (inverse-distance) influence of a data point,
# C the model complexity.
clf = svm.SVC(gamma=0.025, C=3)
X_train, X_test, Y_train, Y_test = train_test_split(Xpca3, y, test_size=0.3, random_state=2)
# Ravel the label column to a 1-D array to avoid DataConversionWarning.
clf.fit(X_train, Y_train.values.ravel())
Y_pred = clf.predict(X_test)
print("Accuracy on training set: {:.2f}".format(clf.score(X_train, Y_train)))
print("Accuracy on test set: {:.2f}".format(clf.score(X_test, Y_test)))
print(metrics.classification_report(Y_test, Y_pred))
print(metrics.confusion_matrix(Y_test, Y_pred))
# Observations (these were bare prose lines in the notebook export and
# would be SyntaxErrors in a .py file):
# Accuracy of Train/Test data without PCA + SVM is 98% and 95% respectively,
# while with PCA + SVM it is 91% and 90% respectively.
# Without PCA the model also fits noise, which inflates training accuracy
# while test accuracy drops — i.e. the model overfits; the same noise does
# not appear in production, so test accuracy suffers.
# With PCA there is less chance of overfitting: accuracy drops slightly,
# but so do the noise level and the overfitting of the model.
from sklearn.model_selection import GridSearchCV
# Grid-search the kernel and C; gamma="scale" lets sklearn pick
# 1 / (n_features * X.var()).
parameters = {'kernel': ('linear', 'rbf'), 'C': [0.01, 0.05, 0.5, 1]}
svc = svm.SVC(gamma="scale")
clf = GridSearchCV(svc, parameters, cv=5)
# BUG FIX: the grid search was constructed but never fitted (the original
# cell only displayed the estimator), so no tuning actually ran. Fit it and
# report the winning configuration.
clf.fit(X, y.values.ravel())
print(clf.best_params_, clf.best_score_)
# Observation: as the value of C increases from 0.01 through 1, accuracy
# increases accordingly.

# scikit-learn k-fold cross-validation
from sklearn.model_selection import cross_val_score
# prepare cross validation
clf = svm.SVC(kernel='linear', C=1)
# np.ravel(y) passes 1-D labels, avoiding DataConversionWarning.
scores = cross_val_score(clf, X, np.ravel(y), cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# Variables are now scaled. Let us try to create clusters.
from sklearn.cluster import KMeans

# Elbow method: run KMeans for k = 1..14 and record the inertia.
# BUG FIX: the loop body had lost its indentation in the notebook export,
# which is a SyntaxError in a .py file; restored here.
cluster_range = range(1, 15)
cluster_errors = []
for num_clusters in cluster_range:
    clusters = KMeans(num_clusters, n_init=5)
    clusters.fit(X)
    labels = clusters.labels_
    centroids = clusters.cluster_centers_
    cluster_errors.append(clusters.inertia_)
clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_errors})
clusters_df[0:15]

from matplotlib import cm
plt.figure(figsize=(12, 6))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="o")
# We can see the bend at 4, so create 4 clusters.
kmeans = KMeans(n_clusters=4, n_init=5, random_state=12345)
kmeans.fit(X)
# Check the number of data points in each cluster.
labels = kmeans.labels_
counts = np.bincount(labels[labels >= 0])
print(counts)
# Distribution looks fine. Check the centres of each group.
centroids = kmeans.cluster_centers_
centroid_df = pd.DataFrame(centroids, columns=list(X))
centroid_df.transpose()
# Add the cluster number to the original cars data.
predictions = kmeans.predict(X)
predictions
data_df["group"] = predictions
data_df['group'] = data_df['group'].astype('category')
data_df.dtypes
data_df.head(5)
# Visualise the per-cluster distributions (re-assigning 'group' as in the
# original, which reverts the category dtype before plotting).
data_df["group"] = predictions
data_df.boxplot(by='group', layout=(5, 4), figsize=(15, 10))
#Group 0 - Characterised by lower axis aspect ratio, lower compactness,lower skewness and radius ratio
# Group 1 - Characterised by relatively higher axis aspect ratio, lower compactness, lower skewness and radius ratio than Group 0
# Group 2 - They are highly compact,higher scatter ratio, high variance but least radius ratio, average skewness
# Group 3 - Highly skew,medium scatter ratio,average compactness and radius ratio.Least circularity among all groups